Source Code of org.terrier.indexing.BasicSinglePassIndexer


/*
 * Terrier - Terabyte Retriever 
 * Webpage: http://terrier.org 
 * Contact: terrier{a.}dcs.gla.ac.uk
 * University of Glasgow - School of Computing Science
 * http://www.gla.ac.uk/
 * 
 * The contents of this file are subject to the Mozilla Public License
 * Version 1.1 (the "License"); you may not use this file except in
 * compliance with the License. You may obtain a copy of the License at
 * http://www.mozilla.org/MPL/
 *
 * Software distributed under the License is distributed on an "AS IS"
 * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
 * the License for the specific language governing rights and limitations
 * under the License.
 *
 * The Original Code is BasicSinglePassIndexer.java.
 *
 * The Original Code is Copyright (C) 2004-2011 the University of Glasgow.
 * All Rights Reserved.
 *
 * Contributor(s):
 *  Roi Blanco
 *  Craig Macdonald
 */


package org.terrier.indexing;


import java.io.IOException;
import java.util.LinkedList;
import java.util.Map;
import java.util.Queue;


import org.terrier.structures.BasicLexiconEntry;
import org.terrier.structures.DocumentIndexEntry;
import org.terrier.structures.FSOMapFileLexiconOutputStream;
import org.terrier.structures.FieldDocumentIndexEntry;
import org.terrier.structures.FieldLexiconEntry;
import org.terrier.structures.Index;
import org.terrier.structures.LexiconOutputStream;
import org.terrier.structures.SimpleDocumentIndexEntry;
import org.terrier.structures.indexing.DocumentIndexBuilder;
import org.terrier.structures.indexing.DocumentPostingList;
import org.terrier.structures.indexing.singlepass.FieldPostingInRun;
import org.terrier.structures.indexing.singlepass.FieldsMemoryPostings;
import org.terrier.structures.indexing.singlepass.FileRunIteratorFactory;
import org.terrier.structures.indexing.singlepass.MemoryPostings;
import org.terrier.structures.indexing.singlepass.RunsMerger;
import org.terrier.structures.indexing.singlepass.SimplePostingInRun;
import org.terrier.structures.postings.BasicIterablePosting;
import org.terrier.structures.postings.FieldIterablePosting;
import org.terrier.utility.ApplicationSetup;
import org.terrier.utility.ArrayUtils;
import org.terrier.utility.FieldScore;
import org.terrier.utility.Files;
import org.terrier.utility.MemoryChecker;
import org.terrier.utility.RuntimeMemoryChecker;
/**
 * This class indexes a document collection (skipping the direct file construction). It implements a single-pass algorithm
 * that operates in two phases:<br>
 * First, it traverses the document collection, passes the terms through the TermPipeline and builds an in-memory
 * representation of the posting lists. When it has exhausted the main memory, it flushes the sorted postings to disk, along
 * with the lexicon (collectively known as a <i>run</i>), and continues traversing the collection.<br>
 * The second phase merges the sorted runs (with their partial lexicons) on disk to create the final inverted file.
 * This class follows the template pattern, so the main bulk of the code is reused for block (and fields) indexing. There are a few hook methods
 * that choose the right classes to instantiate, depending on the indexing options defined.
 * <p>
 * Memory tracking is a key concern in this class. The properties listed below control how the amount of memory
 * consumed is checked, how regularly the memory is checked, and (optional) maximums on the amount of memory that
 * can be used for the postings, or on the number of documents before a flush is committed.
 * <p>
 * <b>Properties:</b>
 * <ul>
 * <li><tt>memory.reserved</tt> - amount of free memory threshold before a run is committed. 
 * Default is 50 000 000 (50MB) and 100 000 000 (100MB) for 32bit and 64bit JVMs respectively.</li>
 * <li><tt>memory.heap.usage</tt> - proportion of max heap allocated to JVM before a run is committed. Default 0.70.</li>
 * <li><tt>indexing.singlepass.max.postings.memory</tt> - maximum amount of memory that the postings can consume before a run is committed.</li>
 * <li><tt>indexing.singlepass.max.documents.flush</tt> - maximum number of documents before a run is committed.</li>
 * <li><tt>docs.check</tt> - interval of how many documents indexed should the amount of free memory be checked. Defaults to 20.</li>
 * </ul> 
 * @author Roi Blanco
 */
public class BasicSinglePassIndexer extends BasicIndexer{


  /** Current document Id */
  protected int currentId = 0;


  /**
   * Maximum amount of memory the in-memory postings may consume before a run is flushed.
   * Read from the <tt>indexing.singlepass.max.postings.memory</tt> property when
   * createInvertedIndex(Collection[]) starts; a value of 0 (the default) disables this check
   * in checkFlush().
   */
  protected long maxMemory = 0;  
  
  /** Memory Checker - provides the method for checking to see if
   * the system is running low on memory */
  protected MemoryChecker memoryCheck = null;
  
  /** Number of documents read per memory check */
  protected int docsPerCheck;
  
  /**
   * Maximum number of documents indexed before a run is flushed.
   * Read from the <tt>indexing.singlepass.max.documents.flush</tt> property in
   * load_indexer_properties(); a value of 0 (the default) disables this check in checkFlush().
   */
  protected int maxDocsPerFlush;
  
  /** Runtime system JVM running this instance of Terrier */
  protected static final Runtime runtime = Runtime.getRuntime();


  /** Number of documents read since the memory consumption was last checked */
  protected int numberOfDocsSinceCheck = 0;
  /** Number of documents read since the memory runs were last flushed to disk */
  protected int numberOfDocsSinceFlush = 0;
  /** Memory status after flush.
   * NOTE(review): in this class it is only assigned once, from runtime.freeMemory(),
   * at the start of createInvertedIndex(Collection[]) - confirm whether subclasses update it. */
  protected long memoryAfterFlush = -1;
  /** Queue with the file names for the runs in disk.
   * Each element is a two-element array: the run file name and its partial lexicon file name. */
  protected Queue<String[]> fileNames = new LinkedList<String[]>();
  /** Number of the current Run to be written in disk */
  protected int currentFile = 0;
  /** Structure that keeps the posting lists in memory */
  protected MemoryPostings mp;
  /** Structure for merging the run */
  protected RunsMerger merger;


  /** Number of documents indexed */
  protected int numberOfDocuments = 0;
  /** Number of tokens indexed */
  protected long numberOfTokens = 0;
  /** Number of unique terms indexed */
  protected int numberOfUniqueTerms = 0;
  /** Number of pointers indexed */
  protected long numberOfPointers = 0;
  /** what class should be used to read the generated inverted index? */
  protected String invertedIndexClass = org.terrier.structures.InvertedIndex.class.getName();
  /** name of the posting iterator class registered with the inverted structure when no fields are indexed */
  protected String basicInvertedIndexPostingIteratorClass = BasicIterablePosting.class.getName();
  /** name of the posting iterator class registered with the inverted structure when fields are indexed */
  protected String fieldInvertedIndexPostingIteratorClass = FieldIterablePosting.class.getName();
  /** what class should be used to read the inverted index as a stream? */
  protected String invertedIndexInputStreamClass = org.terrier.structures.InvertedIndexInputStream.class.getName();
  /**
   * Constructs an instance of a BasicSinglePassIndexer, using the given path name
   * for storing the data structures.
   * @param pathname String the path where the datastructures will be created. This is assumed to be
   * absolute.
   * @param prefix String the prefix of the index, usually "data".
   */
  public BasicSinglePassIndexer(String pathname, String prefix) {
    super(pathname, prefix);
    //delay the execution of init() if we are a parent class
        if (this.getClass() == BasicSinglePassIndexer.class) 
            init();
  }


  /** Protected do-nothing constructor for use by child classes */
  protected BasicSinglePassIndexer(long a, long b, long c) {
    super(a,b,c);
  }




  @Override
  public void createDirectIndex(Collection[] collections) {
    createInvertedIndex(collections);
  }
  @Override
  public void createInvertedIndex(){}








  /**
   *  Builds the inverted file and lexicon file for the given collections
   * Loops through each document in each of the collections,
   * extracting terms and pushing these through the Term Pipeline
   * (eg stemming, stopping, lowercase).
   *  @param collections Collection[] the collections to be indexed.
   */
  public void createInvertedIndex(Collection[] collections) {
    ////logger.info("Creating IF (no direct file)..");
    long startCollection, endCollection;
    fileNames = new LinkedList<String[]>();  
    numberOfDocuments = currentId = numberOfDocsSinceCheck = numberOfDocsSinceFlush = numberOfUniqueTerms = 0;
    numberOfTokens = numberOfPointers = 0;
    createMemoryPostings();
    currentIndex = Index.createNewIndex(path, prefix);
    docIndexBuilder = new DocumentIndexBuilder(currentIndex, "document");
    metaBuilder = createMetaIndexBuilder();
    
    emptyDocIndexEntry = (FieldScore.FIELDS_COUNT > 0) ? new FieldDocumentIndexEntry(FieldScore.FIELDS_COUNT) : new SimpleDocumentIndexEntry();
    
    MAX_DOCS_PER_BUILDER = Integer.parseInt(ApplicationSetup.getProperty("indexing.max.docs.per.builder", "0"));
    maxMemory = Long.parseLong(ApplicationSetup.getProperty("indexing.singlepass.max.postings.memory", "0"));
    final boolean boundaryDocsEnabled = BUILDER_BOUNDARY_DOCUMENTS.size() > 0;
    final int collections_length = collections.length;
    boolean stopIndexing = false;
    System.gc();
    memoryAfterFlush = runtime.freeMemory();
  //  logger.debug("Starting free memory: "+memoryAfterFlush/1000000+"M");


    for(int collectionNo = 0; ! stopIndexing && collectionNo < collections_length; collectionNo++)
    {
      Collection collection = collections[collectionNo];
      startCollection = System.currentTimeMillis();
      while(collection.nextDocument())
      //while(collection.hasNext())
      {
        /* get the next document from the collection */
        //Document doc = collection./next();
        Document doc = collection.getDocument();
        if (doc == null)
          continue;
        //numberOfDocuments++;
        /* setup for parsing */
        createDocumentPostings();


        String term; //term we're currently processing
        numOfTokensInDocument = 0;
        //get each term in the document
        while (!doc.endOfDocument()) {


          if ((term = doc.getNextTerm())!=null && !term.equals("")) {
            termFields = doc.getFields();
            /* pass term into TermPipeline (stop, stem etc) */
            pipeline_first.processTerm(term);
            /* the term pipeline will eventually add the term to this object. */
          }
          if (MAX_TOKENS_IN_DOCUMENT > 0 &&
              numOfTokensInDocument > MAX_TOKENS_IN_DOCUMENT)
            break;
        }
        //if we didn't index all tokens from document,
        //we need to get to the end of the document.
        while (!doc.endOfDocument())
          doc.getNextTerm();
        
        pipeline_first.reset();
        /* we now have all terms in the DocumentTree, so we save the document tree */
        try
        {
          if (termsInDocument.getDocumentLength() == 0)
          {  /* this document is empty, add the minimum to the document index */
            indexEmpty(doc.getAllProperties());
            if (IndexEmptyDocuments)
            {
              currentId++;
              numberOfDocuments++;
            }
          }
          else
          {  /* index this document */
            numberOfTokens += numOfTokensInDocument;
            indexDocument(doc.getAllProperties(), termsInDocument);
          }
        }
        catch (Exception ioe)
        {
    //      logger.error("Failed to index "+doc.getProperty("docno"),ioe);
        }


        if (MAX_DOCS_PER_BUILDER>0 && numberOfDocuments >= MAX_DOCS_PER_BUILDER)
        {
          stopIndexing = true;
          break;
        }


        if (boundaryDocsEnabled && BUILDER_BOUNDARY_DOCUMENTS.contains(doc.getProperty("docno")))
        {
    //      //logger.warn("Document "+doc.getProperty("docno")+" is a builder boundary document. Boundary forced.");
          stopIndexing = true;
          break;
        }
        termsInDocument.clear();
      }
      
      try{
        forceFlush();
        endCollection = System.currentTimeMillis();
        long partialTime = (endCollection-startCollection)/1000;
    //    //logger.info("Collection #"+collectionNo+ " took "+partialTime+ " seconds to build the runs for "+numberOfDocuments+" documents\n");
              
        
        
        docIndexBuilder.finishedCollections();
        if (FieldScore.FIELDS_COUNT > 0)
        {
          currentIndex.addIndexStructure("document-factory", FieldDocumentIndexEntry.Factory.class.getName(), "java.lang.String", "${index.inverted.fields.count}");
        }
        else
        {
          currentIndex.addIndexStructure("document-factory", SimpleDocumentIndexEntry.Factory.class.getName(), "", "");
        }
        metaBuilder.close();
        currentIndex.flush();
        
    //    //logger.info("Merging "+fileNames.size()+" runs...");
        startCollection = System.currentTimeMillis();
        
        performMultiWayMerge();
        currentIndex.flush();
        endCollection = System.currentTimeMillis();
    //    //logger.info("Collection #"+collectionNo+" took "+((endCollection-startCollection)/1000)+" seconds to merge\n ");
    //    //logger.info("Collection #"+collectionNo+" total time "+( (endCollection-startCollection)/1000+partialTime));
        long secs = ((endCollection-startCollection)/1000);
      //  if (secs > 3600)
            //       //logger.info("Rate: "+((double)numberOfDocuments/((double)secs/3600.0d))+" docs/hour");
      } catch (Exception e) {
    //    logger.error("Problem finishing index", e);
      }
    }
    finishedInvertedIndexBuild();
  }


  /** check to see if a flush is required, and perform if necessary */
  protected void checkFlush() throws IOException
  {
    if(docsPerCheck != numberOfDocsSinceCheck)
      return;
    numberOfDocsSinceCheck = 0;
    final long consumed = mp.getMemoryConsumption();
    boolean doFlush = false;
    final boolean memCheck = memoryCheck.checkMemory();
    String msg = null; 
  //  logger.debug(msg="maxAllowedMemory="+maxMemory + " consumed="+consumed + " maxDocsPerFlush="+maxDocsPerFlush
  //      +" numberOfDocsSinceFlush="+numberOfDocsSinceFlush + " memcheck="+ memCheck);
    if (memCheck)
    {
      doFlush = true;
      msg += " (memory check threshold hit)";
    }
    if (maxDocsPerFlush > 0 && numberOfDocsSinceFlush >= maxDocsPerFlush)
    {
      msg += " (doc threhold hit)";
      doFlush = true;
    }
    if (maxMemory > 0 && consumed > maxMemory )
    {
      msg += " (posting memory threshold hit)";
      doFlush = true;
    }
    if (doFlush)
    {
  //    //logger.info("Flush forced: " + msg);
      forceFlush();
    }
  }
  
  @edu.umd.cs.findbugs.annotations.SuppressWarnings(
      value="DM_GC",
      justification="Forcing GC is an essential part of releasing" +
          "memory for further indexing")
  /** causes the posting lists built up in memory to be flushed out */
  protected void forceFlush() throws IOException
  {  
    mp.finish(finishMemoryPosting());
    System.gc();
    createMemoryPostings();
    memoryCheck.reset();
    numberOfDocsSinceFlush = 0;  
  }
  
  /**
   * {@inheritDoc}.
   * This implementation only places content in the runs in memory, which will eventually be flushed to disk.
   */
  @Override
  protected void indexDocument(Map<String,String> docProperties, DocumentPostingList termsInDocument) throws Exception
  {
    if (termsInDocument.getDocumentLength() > 0) {
      numberOfDocsSinceCheck++;
      numberOfDocsSinceFlush++;
      
      checkFlush();
      mp.addTerms(termsInDocument, currentId);
      DocumentIndexEntry die = termsInDocument.getDocumentStatistics();
      docIndexBuilder.addEntryToBuffer((FieldScore.FIELDS_COUNT > 0) ? die : new SimpleDocumentIndexEntry(die));
      metaBuilder.writeDocumentEntry(docProperties);
      currentId++;
      numberOfDocuments++;
    }
  }


  /**
   * Adds the name of the current run + partial lexicon to be flushed in disk.
   * @return the two dimensional String[] array with the names of the run and partial lexicon to write.
   */
  protected String[] finishMemoryPosting(){
    String[] names = new String[2];
    names[0] = fileNameNoExtension + "Run."+(currentFile);
    names[1] = fileNameNoExtension + "Run."+(currentFile++)+".str";
    fileNames.add(names);
    return names;
  }


  /**
   * Uses the merger class to perform a k multiway merge
   * in a set of previously written runs.
   * The file names and the number of runs are given by the private queue
   */
  public void performMultiWayMerge() throws IOException {
    String[][] _fileNames = getFileNames();
    this.currentIndex.setIndexProperty("max.term.length", ApplicationSetup.getProperty("max.term.length", ""+20));
    LexiconOutputStream<String> lexStream = new FSOMapFileLexiconOutputStream(this.currentIndex, "lexicon", 
        (super.numFields > 0 ? FieldLexiconEntry.Factory.class : BasicLexiconEntry.Factory.class));
    
    try{
      if (useFieldInformation)
        createFieldRunMerger(_fileNames);
      else
        createRunMerger(_fileNames);
      merger.beginMerge(_fileNames.length, path + ApplicationSetup.FILE_SEPARATOR + prefix +  ".inverted.bf");
      while(!merger.isDone()){
        merger.mergeOne(lexStream);
      }
      merger.endMerge(lexStream);
      lexStream.close();
      //the constructor for FieldLexiconEntry is wrong - replace it
      if (super.numFields > 0)
      {
        this.currentIndex.addIndexStructure("lexicon-valuefactory", FieldLexiconEntry.Factory.class.getName(), "java.lang.String", "${index.inverted.fields.count}");
      }
      numberOfUniqueTerms = merger.getNumberOfTerms();
      numberOfPointers = merger.getNumberOfPointers();
      // Delete the runs files
      for(int i = 0; i < _fileNames.length; i++)
      {
        Files.delete(_fileNames[i][0]);
        Files.delete(_fileNames[i][1]);
      }
      currentIndex.setIndexProperty("num.Terms", ""+numberOfUniqueTerms);
      currentIndex.setIndexProperty("num.Pointers", ""+numberOfPointers);
      currentIndex.setIndexProperty("num.Tokens", ""+numberOfTokens);
      currentIndex.addIndexStructure(
          "inverted",
          invertedIndexClass,
          "org.terrier.structures.Index,java.lang.String,org.terrier.structures.DocumentIndex,java.lang.Class", 
          "index,structureName,document,"+ 
            (FieldScore.FIELDS_COUNT > 0
              ? fieldInvertedIndexPostingIteratorClass
              : basicInvertedIndexPostingIteratorClass ));
      currentIndex.addIndexStructureInputStream(
                    "inverted",
                    invertedIndexInputStreamClass,
                    "org.terrier.structures.Index,java.lang.String,java.util.Iterator,java.lang.Class",
                    "index,structureName,lexicon-entry-inputstream,"+
                      (FieldScore.FIELDS_COUNT > 0
                        ? fieldInvertedIndexPostingIteratorClass
              : basicInvertedIndexPostingIteratorClass ));
      currentIndex.setIndexProperty("index.inverted.fields.count", ""+FieldScore.FIELDS_COUNT );
      currentIndex.setIndexProperty("index.inverted.fields.names", ArrayUtils.join(FieldScore.FIELD_NAMES, ","));
    }catch(Exception e){
  //    logger.error("Problem in performMultiWayMerge", e);
    }
  }


  /**
   * @return the String[][] structure with the name of the runs files and partial lexicons.
   */
  protected String[][] getFileNames(){
    String[][] files =  new String[fileNames.size()][2];
    int i = 0;
    while(!fileNames.isEmpty()){
      files[i++] = fileNames.poll();
    }
    return files;
  }




  /**
   * Hook method that creates a FieldRunMerger instance
   * @throws IOException if an I/O error occurs.
   */
  protected void createFieldRunMerger(String[][] files) throws Exception{
    merger = new RunsMerger(new FileRunIteratorFactory(files, FieldPostingInRun.class, super.numFields));
  }




  /**
   * Hook method that creates a RunsMerger instance
   * @throws IOException if an I/O error occurs.
   */
  protected void createRunMerger(String[][] files) throws Exception{
    merger = new RunsMerger(new FileRunIteratorFactory(files, 
        useFieldInformation ? FieldPostingInRun.class : SimplePostingInRun.class, 0));
  }


  /**
   * Hook method that creates the right type of MemoryPostings class.
   */
  protected void createMemoryPostings(){
    if (useFieldInformation)
      mp = new FieldsMemoryPostings();
    else
      mp = new MemoryPostings();
  }


  @Override
  protected void load_indexer_properties() {
    super.load_indexer_properties();
    docsPerCheck = ApplicationSetup.DOCS_CHECK_SINGLEPASS;
    maxDocsPerFlush = Integer.parseInt(ApplicationSetup.getProperty("indexing.singlepass.max.documents.flush", "0"));
    memoryCheck = new RuntimeMemoryChecker();
  //  //logger.info("Checking memory usage every " + docsPerCheck + " maxDocPerFlush=" + maxDocsPerFlush);
  }




}
Source Code of org.terrier.indexing.BasicSinglePassIndexer

Related Classes of org.terrier.indexing.BasicSinglePassIndexer